** Further PSID cleaning for age, sex, and samehead 
** for The Effects of Changes in Local Bank Health on Household Consumption
** by Daniel Cooper and Joe Peek, ReStat 2020
/* ////////////////////////////////////////////////////////////////////////// */
clear all
set more off

* The script has two sections, each wrapped in an "if <switch>" block below.
* Set a switch to 1 to run its section, leave it at 0 to skip it:
*   agesex   -> section I  (age and sex from the individual files)
*   samehead -> section II (samehead / newwife corrections)
scalar define agesex   = 0
scalar define samehead = 0

* Root folder holding the PSID files; edit to match the local machine
local psid "~/PSID_new"

////////////////////////////////////////////////////////////////////////////////
**** I: AGE, SEX ***************************************************************
////////////////////////////////////////////////////////////////////////////////
/* Get age and sex from the individual PSID files -- more accurate than the household-level data in the 
family file, where the "head" observations carry age and gender information for heads and spouses (if any). 
head: identify heads and take their age and sex.
wives: identify wives and assign their age information to their head's observation;
	sex is always female (2).
There are additional reporting issues with the age data that are fixed elsewhere, for example in psid_ageearnings. */

if scalar(agesex) { 
////////////////////////////////////////////////////////////////////////////////
* Section I: build psid_age_sex_correct, one row per head (unique, year) holding
* the head's age and sex (ageh, sexh) and the wife's age (agew), all read from
* the individual-level file movein_out_upd15.
*
* scalar(agesex) is used instead of the bare name so the switch can never be
* shadowed by a variable (or variable abbreviation) in memory.
*
* Head / wife definitions are written once so that every filter below agrees.
* The relhead coding differs before and from 1983 (1 vs 10 for heads,
* 2 vs 20/22 for wives); heads must have seqno 1 and wives seqno 2.
local is_head "((relhead == 1 & seqno == 1 & year < 1983) | (relhead == 10 & seqno == 1 & year >= 1983))"
local is_wife "((relhead == 2 & seqno == 2 & year < 1983) | ((relhead == 20 | relhead == 22) & seqno == 2 & year >= 1983))"

*HEAD
* individual data
use "`psid'/movein_out_upd15", clear

*extend sex forward to 2013 and 2015 (summary variable - doesn't change year to year)
* xtset leaves the data sorted by unique year. The by-prefix restricts the fill
* to rows of the same person; without it a person whose first row has missing
* sex would inherit the value of whoever happens to be sorted just before.
xtset unique year
by unique: carryforward sex, replace

*keep heads
keep if `is_head'

rename age ageh 
rename sex sexh 

keep year unique ageh sexh 

tempfile head 
save "`head'" 

////////////////////////////////////////////////////////////////////////////////
*WIFE
* individual data
use "`psid'/movein_out_upd15", clear 

*gen famid68 = floor(unique/1000)

*keep only heads and wives 
keep if `is_head' | `is_wife'

* Flags. The original wife flag had a misplaced parenthesis that left
* relhead == 20 unconstrained by seqno; after the keep above every remaining
* relhead 20 row already has seqno 2, so the corrected flag marks the same rows.
g wife = 1 if `is_wife'
g head = 1 if `is_head'
*not keeping wives that don't have seqno 2 -- checked main data and heads don't have wife data in those years 

*count number of people in a household group in a year
bys year famid: egen members = count(unique)

count if members == 1 & wife == 1
*0. good. no wives without a head

*want to get the wives age - extend to head as well 
* wife is 1 or missing and missing sorts last, so within a year-famid group the
* wife row (if any) comes first and the head row picks up her age from _n-1.
* NOTE(review): this relies on at most one head and one wife per year-famid group.
sort year famid wife 
by year famid: g age2 = age[_n-1] if _n != 1 

*drop wives
drop if wife == 1

* what remains are head rows; overwrite their own age with the wife's age
* (missing when the group has no wife row)
replace age = age2 
rename age agew

keep unique year agew 

tempfile wife
save "`wife'"

////////////////////////////////////////////////////////////////////////////////
*MERGE

*merge with head 
merge 1:1 unique year using "`head'" 

keep unique year agew ageh sexh 

save "`psid'/psid_age_sex_correct", replace
}

////////////////////////////////////////////////////////////////////////////////
**** II: SAMEHEAD ***************************************************************
////////////////////////////////////////////////////////////////////////////////
/* The PSID tracks whether the household head is the same between waves (and whether there is a new wife).
This code corrects samehead when it appears to change temporarily and then goes back to the previous head -- that 
is, when it is unlikely that the head has actually changed.  

  When a HH jumps in and out of the PSID, the samehead variable is sometimes recoded to 0 (not same head)
	(and equivalently for newwife). We change it back to 1 if the head's age stays on the same age path after the jump out/in. 

Generally, we trust the samehead variable, but if a household comes back in after a jump in/out of the PSID and is on a different
age path, or samehead changes and the head is on a different age path, 
then we assume a different head.

Note that the age path data used here are not yet close to the final age cleaning that happens in psid_ageearnings 
	and in the main consumption data.
 */

if scalar(samehead) {
* Section II: build samehead_fix (unique, year, samehead, newwife).
* samehead / newwife are recoded from changehh / changewf and then reset to
* "no change" on rows that follow a gap in the panel when the recorded age is
* still on the person's age path.
*
* scalar(samehead) is used instead of the bare name because this section
* creates a variable that is also called samehead; in expressions Stata
* resolves a bare name to a variable before a scalar.
use "`psid'/Demographic/psid_demographic_fin", clear

*merge in correct age and sex variable from individual files 
drop ageh sex agew 

merge 1:1 unique year using "`psid'/psid_age_sex_correct", keep(3) nogen
drop if year < 1980
*no samehead data before that, so hard to tell if jumps are ok 

////////////////////////////////////////////////////////////////////////////////

*recode to samehead so variable consistent across do files
g samehead = 1 if changehh == 1
replace samehead = 0 if changehh == 2 
replace samehead = . if changehh == 9

g newwife = 1 if changewf == 1
replace newwife = . if changewf == 0
replace newwife = 0 if changewf == 5

////////////////////////////////////////////////////////////////////////////////

*mark if jump in and out of psid 
* After xtset, L.year (L2.year) is missing when the unit has no row one (two)
* years earlier, so each inequality is true exactly when that earlier row is
* absent. Rows before 1998 are checked against a one-year lag, rows after 1998
* against a two-year lag; the first row of each unit is never flagged.
xtset unique year

by unique: g yr_gap = 1 if year != L.year + 1 & year < 1998 & _n != 1
by unique: replace yr_gap = 1 if year != L2.year + 2 & year > 1998 & _n != 1

////////////////////////////////////////////////////////////////////////////////
** this is a slightly abbreviated version of the age path fixes that happen in the age
* earnings do file.  (see it for more details on the approach).  Do this here only for purposes
* of determining samehead/wife, age data do not actually get saved here and passed elsewhere.

*create age path:: HEAD
replace ageh = . if ageh == 999

*creating birth year for first observation
* uses the first of the unit's first three rows that has a non-missing age
sort unique year
by unique: g birthyear = year[1] - ageh[1]
by unique: replace birthyear = year[2] - ageh[2] if ageh[2] != . & ageh[1] == .  
by unique: replace birthyear = year[3] - ageh[3] if ageh[3] != . & ageh[2] == . & ageh[1] == . 
	*stop getting any changes after 3 
	*making sure get an birth year even if first few ageh observations are missing 

*assign minimum birthyear that exists in each unique. sometimes vary because age doesn't change in two years due to interview month 
bysort unique: ereplace birthyear = min(birthyear)

*create age path based on birthyear
g age_path = year - birthyear 

*count how often households are off of their path -- if first obs wrong, then entire path wrong
* NOTE(review): a row with missing ageh and a non-missing path also counts as off path here
g off_path = 1 if ageh != age_path & ageh != age_path + 1 & ageh != age_path - 1
by unique: egen all_off_path = total(off_path)

*count what fraction of observations are off the path 
bys unique: egen obs = count(unique)
g off_path_frac = all_off_path / obs 


*create a second age path for people whos off_path fraction is greater than 1/2 
* anchored on the second row (or the third when the first two ages are missing)
sort unique year
by unique: g birthyear2 = year[2] - ageh[2] if off_path_frac > .5
by unique: replace birthyear2 = year[3] - ageh[3] if ageh[3] != . & ageh[2] == . & ageh[1] == .  & off_path_frac > .5

*assign minimum birthyear that exists in each unique. sometimes vary because age doesn't change in two years due to interview month 
bysort unique: ereplace birthyear2 = min(birthyear2)

*create age path based on birthyear
g age_path2 = year - birthyear2

*a deviation from this second path is nearly always a new head

/////////////////////////////////////////////

*create age path:: WIFE
replace agew = . if agew == 999

*creating birth year for first observation
sort unique year
by unique: g birthyearw = year[1] - agew[1]
by unique: replace birthyearw = year[2] - agew[2] if agew[2] != . & agew[1] == .  
by unique: replace birthyearw = year[3] - agew[3] if agew[3] != . & agew[2] == . & agew[1] == . 
	*stop getting any changes after 3 
	*making sure get an birth year even if first few agew observations are missing 

*assign minimum birthyear that exists in each unique. sometimes vary because age doesn't change in two years due to interview month 
bysort unique: ereplace birthyearw = min(birthyearw)

replace birthyearw = . if agew == .
*means there is no wife in that year. 

*create age path based on birthyear
g age_pathw = year - birthyearw 

*count how often households are off of their path -- if first obs wrong, then entire path wrong
g off_pathw = 1 if agew != age_pathw & agew != age_pathw + 1 & agew != age_pathw - 1
by unique: egen all_off_pathw = total(off_pathw)

*count what fraction of observations are off the path 
* the denominator obs is the count of all rows of the unit, with or without a wife
g off_path_fracw = all_off_pathw / obs 

*create a second age path for people whos off_path fraction is greater than 1/2 
sort unique year
by unique: g birthyearw2 = year[2] - agew[2] if off_path_fracw > .5
by unique: replace birthyearw2 = year[3] - agew[3] if agew[3] != . & agew[2] == . & agew[1] == .  & off_path_fracw > .5

*assign minimum birthyear that exists in each unique. sometimes vary because age doesn't change in two years due to interview month 
bysort unique: ereplace birthyearw2 = min(birthyearw2)

replace birthyearw2 = . if agew == . 

*create age path based on birthyear
g age_pathw2 = year - birthyearw2

////////////////////////////////////////////////////////////////////////////////

* evaluate where there are deviations from age path that seem ok versus not for head and wives
* assign new head/wives if there seems to be a true change in age/gender

*when there is a year gap, is it still on the age path? Is sex the same? 
	*note can deviate from age path by one and ok because of interview month vs. birthday month timing 
* The !missing() guards matter: Stata evaluates missing == missing as true, so
* without them a row with no recorded age and no computable path would be
* treated as "on path" and its samehead / newwife value would be overwritten
* below without any age evidence.
g age_ok = 1 if yr_gap == 1 & !missing(ageh) & (ageh == age_path | ageh == age_path + 1 | ageh == age_path - 1) & off_path_frac <= .5
replace age_ok = 1 if yr_gap == 1 & !missing(ageh) & (ageh == age_path2 | ageh == age_path2 + 1 | ageh == age_path2 - 1) & off_path_frac > .5

g age_okw = 1 if yr_gap == 1 & !missing(agew) & (agew == age_pathw | agew == age_pathw + 1 | agew == age_pathw - 1) & off_path_fracw <= .5
replace age_okw = 1 if yr_gap == 1 & !missing(agew) & (agew == age_pathw2 | agew == age_pathw2 + 1 | agew == age_pathw2 - 1) & off_path_fracw > .5

*make sure sex doesn't jump
* diagnostic only: the levels are printed for inspection and not used below
bysort unique: egen sexh_ave = mean(sexh)
levelsof sexh_ave
* 1 and 2 -- consistent. don't need to worry about this. 
*sex wife always female

*if head changes a bunch and year jumps, then replace samehead
replace samehead = 1 if yr_gap == 1 & samehead == 0 & age_ok == 1 
replace newwife = 0 if yr_gap == 1 & newwife == 1 & age_okw == 1

keep unique year samehead newwife

save "`psid'/samehead_fix", replace 
}
